{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 导入模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from modelscope import AutoModelForCausalLM, AutoTokenizer\n",
    "from peft import PeftModel\n",
    "import torch\n",
    "\n",
    "# 原始模型路径和训练后的LoRA路径\n",
    "base_model_path = \"/root/autodl-tmp/Qwen/Qwen3-8B\"  # 原始模型路径\n",
    "lora_model_path = \"./lora_model\"                    # 你保存的LoRA模型路径\n",
    "\n",
    "# ---------- 1. 加载分词器 ----------\n",
    "tokenizer = AutoTokenizer.from_pretrained(lora_model_path)  # 直接加载你保存的分词器\n",
    "\n",
    "# ---------- 2. 加载基础模型 ----------\n",
    "base_model = AutoModelForCausalLM.from_pretrained(\n",
    "    base_model_path,\n",
    "    torch_dtype=torch.bfloat16,\n",
    "    device_map=\"auto\"  # 自动分配设备（GPU/CPU）\n",
    ")\n",
    "\n",
    "# ---------- 3. 加载LoRA适配器 ----------\n",
    "model = PeftModel.from_pretrained(base_model, lora_model_path)\n",
    "model.eval()  # 切换到推理模式"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 保存模型的方式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 保存为float16格式（用于VLLM）\n",
    "# 支持多种保存方式：merged_16bit（float16）、merged_4bit（int4）或lora（适配器）\n",
    "\n",
    "# Merge to 16bit\n",
    "if False:\n",
    "    model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_16bit\",)\n",
    "if False: # 上传到HuggingFace Hub\n",
    "    model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"merged_16bit\", token = \"\")\n",
    "\n",
    "# 保存为4位精度\n",
    "if True:\n",
    "    model.save_pretrained_merged(\"model\", tokenizer, save_method = \"merged_4bit_forced\",) # 改为_forced版本\n",
    "if True: # 上传到HuggingFace Hub\n",
    "    model.push_to_hub_merged(\"yukisama123/Qwen3-vLLM\", tokenizer, save_method = \"merged_4bit_forced\", token = \"\") # 同样改为_forced版本\n",
    "\n",
    "# 仅保存LoRA适配器\n",
    "if False:\n",
    "    model.save_pretrained_merged(\"model\", tokenizer, save_method = \"lora\",)\n",
    "if False: # 上传到HuggingFace Hub\n",
    "    model.push_to_hub_merged(\"hf/model\", tokenizer, save_method = \"lora\", token = \"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# GGUF / llama.cpp 格式转换\n",
    "# 支持多种量化方法，如q8_0、q4_k_m、q5_k_m等\n",
    "\n",
    "# F16（Float16）格式\n",
    "\n",
    "# 精度类型：半精度浮点数（16位浮点数）\n",
    "# 内存占用：比原始FP32（32位浮点数）减少约50%的存储空间\n",
    "# 精度保留：保留了相对较高的数值精度，损失较小\n",
    "# 推理性能：比FP32快，但比更低位量化格式慢\n",
    "# 适用场景：当需要在内存使用和模型精度之间取得平衡时使用\n",
    "\n",
    "# Q4_K_M格式\n",
    "\n",
    "# 精度类型：混合4位量化格式（是GGUF量化方案的一种）\n",
    "# 内存占用：比F16减少约75%的存储空间，比原始FP32减少约87.5%\n",
    "# 量化策略：针对不同权重采用不同的量化策略\n",
    "\n",
    "# 对注意力机制中的WV矩阵和前馈网络中的W2矩阵的一半使用Q6_K量化\n",
    "# 对其余权重使用Q4_K量化\n",
    "\n",
    "# 精度与速度：牺牲一定精度以获得更小的文件大小和更快的推理速度\n",
    "# 适用场景：适合在资源受限设备上运行模型，如个人电脑或移动设备\n",
    "\n",
    "# # Save to 8bit Q8_0\n",
    "# if False:\n",
    "#     model.save_pretrained_gguf(\"model\", tokenizer,)\n",
    "# # Remember to go to https://huggingface.co/settings/tokens for a token!\n",
    "# # And change hf to your username!\n",
    "# if False:\n",
    "#     model.push_to_hub_gguf(\"hf/model\", tokenizer, token = \"\")\n",
    "\n",
    "# # 保存为16位GGUF\n",
    "# if False:\n",
    "#     model.save_pretrained_gguf(\"model\", tokenizer, quantization_method = \"f16\")\n",
    "# if False: # 上传到HuggingFace Hub\n",
    "#     model.push_to_hub_gguf(\"hf/model\", tokenizer, quantization_method = \"f16\", token = \"\")\n",
    "\n",
    "# # 保存为q4_k_m格式GGUF\n",
    "if True:\n",
    "    model.save_pretrained_gguf(\"model\", tokenizer, quantization_method = \"q4_k_m\")\n",
    "if True:# 上传到HuggingFace Hub\n",
    "    model.push_to_hub_gguf(\"yukisama123/Qwen3-GGUF\", tokenizer, quantization_method = \"q4_k_m\", token = \"\")\n",
    "\n",
    "# # 保存多种GGUF格式（批量导出更高效）\n",
    "# if False:\n",
    "#     model.push_to_hub_gguf(\n",
    "#         \"hf/model\", # Change hf to your username!\n",
    "#         tokenizer,\n",
    "#         quantization_method = [\"q4_k_m\", \"q8_0\", \"q5_k_m\",],\n",
    "#         token = \"\", # Get a token at https://huggingface.co/settings/tokens\n",
    "#     )"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
